import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the breast-cancer CSV into a DataFrame and display it.
DATA_PATH = "C:/Users/HARISHKUMAR/Downloads/breast-cancer.csv"
df = pd.read_csv(DATA_PATH)
df
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | ... | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | NaN |
| 565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | ... | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | NaN |
| 566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | ... | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | NaN |
| 567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | ... | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | NaN |
| 568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | ... | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | NaN |
569 rows × 33 columns
# Dimensions of the frame as (rows, columns).
df.shape
(569, 33)
# Encode the diagnosis labels as integers: 'M' -> 1, 'B' -> 0.
# Series.map turns any value missing from the dict into NaN, so re-running
# this cell on an already-encoded column would blank it out.
label_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(label_codes)
df.head()
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | 1 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | 1 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | 1 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | 1 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
# Show the last rows of the frame (pandas default: 5).
df.tail()
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 564 | 926424 | 1 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | ... | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 | NaN |
| 565 | 926682 | 1 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | ... | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 | NaN |
| 566 | 926954 | 1 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | ... | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 | NaN |
| 567 | 927241 | 1 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | ... | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 | NaN |
| 568 | 92751 | 0 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | ... | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 | NaN |
5 rows × 33 columns
# Print per-column non-null counts and dtypes, plus memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null int64 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 32 Unnamed: 32 0 non-null float64 dtypes: float64(31), int64(2) memory usage: 146.8 KB
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()
id 0 diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 concave points_mean 0 symmetry_mean 0 fractal_dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 concave points_se 0 symmetry_se 0 fractal_dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 concave points_worst 0 symmetry_worst 0 fractal_dimension_worst 0 Unnamed: 32 569 dtype: int64
# Number of distinct non-null values in each column.
df.nunique()
id 569 diagnosis 2 radius_mean 456 texture_mean 479 perimeter_mean 522 area_mean 539 smoothness_mean 474 compactness_mean 537 concavity_mean 537 concave points_mean 542 symmetry_mean 432 fractal_dimension_mean 499 radius_se 540 texture_se 519 perimeter_se 533 area_se 528 smoothness_se 547 compactness_se 541 concavity_se 533 concave points_se 507 symmetry_se 498 fractal_dimension_se 545 radius_worst 457 texture_worst 511 perimeter_worst 514 area_worst 544 smoothness_worst 411 compactness_worst 529 concavity_worst 539 concave points_worst 492 symmetry_worst 500 fractal_dimension_worst 535 Unnamed: 32 0 dtype: int64
# Data type of each column.
df.dtypes
id int64 diagnosis int64 radius_mean float64 texture_mean float64 perimeter_mean float64 area_mean float64 smoothness_mean float64 compactness_mean float64 concavity_mean float64 concave points_mean float64 symmetry_mean float64 fractal_dimension_mean float64 radius_se float64 texture_se float64 perimeter_se float64 area_se float64 smoothness_se float64 compactness_se float64 concavity_se float64 concave points_se float64 symmetry_se float64 fractal_dimension_se float64 radius_worst float64 texture_worst float64 perimeter_worst float64 area_worst float64 smoothness_worst float64 compactness_worst float64 concavity_worst float64 concave points_worst float64 symmetry_worst float64 fractal_dimension_worst float64 Unnamed: 32 float64 dtype: object
# Summary statistics, transposed so each column of df becomes one row.
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| id | 569.0 | 3.037183e+07 | 1.250206e+08 | 8670.000000 | 869218.000000 | 906024.000000 | 8.813129e+06 | 9.113205e+08 |
| diagnosis | 569.0 | 3.725835e-01 | 4.839180e-01 | 0.000000 | 0.000000 | 0.000000 | 1.000000e+00 | 1.000000e+00 |
| radius_mean | 569.0 | 1.412729e+01 | 3.524049e+00 | 6.981000 | 11.700000 | 13.370000 | 1.578000e+01 | 2.811000e+01 |
| texture_mean | 569.0 | 1.928965e+01 | 4.301036e+00 | 9.710000 | 16.170000 | 18.840000 | 2.180000e+01 | 3.928000e+01 |
| perimeter_mean | 569.0 | 9.196903e+01 | 2.429898e+01 | 43.790000 | 75.170000 | 86.240000 | 1.041000e+02 | 1.885000e+02 |
| area_mean | 569.0 | 6.548891e+02 | 3.519141e+02 | 143.500000 | 420.300000 | 551.100000 | 7.827000e+02 | 2.501000e+03 |
| smoothness_mean | 569.0 | 9.636028e-02 | 1.406413e-02 | 0.052630 | 0.086370 | 0.095870 | 1.053000e-01 | 1.634000e-01 |
| compactness_mean | 569.0 | 1.043410e-01 | 5.281276e-02 | 0.019380 | 0.064920 | 0.092630 | 1.304000e-01 | 3.454000e-01 |
| concavity_mean | 569.0 | 8.879932e-02 | 7.971981e-02 | 0.000000 | 0.029560 | 0.061540 | 1.307000e-01 | 4.268000e-01 |
| concave points_mean | 569.0 | 4.891915e-02 | 3.880284e-02 | 0.000000 | 0.020310 | 0.033500 | 7.400000e-02 | 2.012000e-01 |
| symmetry_mean | 569.0 | 1.811619e-01 | 2.741428e-02 | 0.106000 | 0.161900 | 0.179200 | 1.957000e-01 | 3.040000e-01 |
| fractal_dimension_mean | 569.0 | 6.279761e-02 | 7.060363e-03 | 0.049960 | 0.057700 | 0.061540 | 6.612000e-02 | 9.744000e-02 |
| radius_se | 569.0 | 4.051721e-01 | 2.773127e-01 | 0.111500 | 0.232400 | 0.324200 | 4.789000e-01 | 2.873000e+00 |
| texture_se | 569.0 | 1.216853e+00 | 5.516484e-01 | 0.360200 | 0.833900 | 1.108000 | 1.474000e+00 | 4.885000e+00 |
| perimeter_se | 569.0 | 2.866059e+00 | 2.021855e+00 | 0.757000 | 1.606000 | 2.287000 | 3.357000e+00 | 2.198000e+01 |
| area_se | 569.0 | 4.033708e+01 | 4.549101e+01 | 6.802000 | 17.850000 | 24.530000 | 4.519000e+01 | 5.422000e+02 |
| smoothness_se | 569.0 | 7.040979e-03 | 3.002518e-03 | 0.001713 | 0.005169 | 0.006380 | 8.146000e-03 | 3.113000e-02 |
| compactness_se | 569.0 | 2.547814e-02 | 1.790818e-02 | 0.002252 | 0.013080 | 0.020450 | 3.245000e-02 | 1.354000e-01 |
| concavity_se | 569.0 | 3.189372e-02 | 3.018606e-02 | 0.000000 | 0.015090 | 0.025890 | 4.205000e-02 | 3.960000e-01 |
| concave points_se | 569.0 | 1.179614e-02 | 6.170285e-03 | 0.000000 | 0.007638 | 0.010930 | 1.471000e-02 | 5.279000e-02 |
| symmetry_se | 569.0 | 2.054230e-02 | 8.266372e-03 | 0.007882 | 0.015160 | 0.018730 | 2.348000e-02 | 7.895000e-02 |
| fractal_dimension_se | 569.0 | 3.794904e-03 | 2.646071e-03 | 0.000895 | 0.002248 | 0.003187 | 4.558000e-03 | 2.984000e-02 |
| radius_worst | 569.0 | 1.626919e+01 | 4.833242e+00 | 7.930000 | 13.010000 | 14.970000 | 1.879000e+01 | 3.604000e+01 |
| texture_worst | 569.0 | 2.567722e+01 | 6.146258e+00 | 12.020000 | 21.080000 | 25.410000 | 2.972000e+01 | 4.954000e+01 |
| perimeter_worst | 569.0 | 1.072612e+02 | 3.360254e+01 | 50.410000 | 84.110000 | 97.660000 | 1.254000e+02 | 2.512000e+02 |
| area_worst | 569.0 | 8.805831e+02 | 5.693570e+02 | 185.200000 | 515.300000 | 686.500000 | 1.084000e+03 | 4.254000e+03 |
| smoothness_worst | 569.0 | 1.323686e-01 | 2.283243e-02 | 0.071170 | 0.116600 | 0.131300 | 1.460000e-01 | 2.226000e-01 |
| compactness_worst | 569.0 | 2.542650e-01 | 1.573365e-01 | 0.027290 | 0.147200 | 0.211900 | 3.391000e-01 | 1.058000e+00 |
| concavity_worst | 569.0 | 2.721885e-01 | 2.086243e-01 | 0.000000 | 0.114500 | 0.226700 | 3.829000e-01 | 1.252000e+00 |
| concave points_worst | 569.0 | 1.146062e-01 | 6.573234e-02 | 0.000000 | 0.064930 | 0.099930 | 1.614000e-01 | 2.910000e-01 |
| symmetry_worst | 569.0 | 2.900756e-01 | 6.186747e-02 | 0.156500 | 0.250400 | 0.282200 | 3.179000e-01 | 6.638000e-01 |
| fractal_dimension_worst | 569.0 | 8.394582e-02 | 1.806127e-02 | 0.055040 | 0.071460 | 0.080040 | 9.208000e-02 | 2.075000e-01 |
| Unnamed: 32 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Bar chart of how many rows carry each diagnosis value.
sns.countplot(data=df, x='diagnosis')
plt.title('Distribution of Diagnosis')
plt.show()
# Draw one histogram (with a KDE overlay) per non-object column, each in its
# own figure.
numerical_columns = df.select_dtypes(exclude=['object']).columns
for col in numerical_columns:
    # The loop body must be indented; without it the cell is a syntax error.
    sns.histplot(df[col], kde=True)
    plt.title(f'Univariate Analysis of {col}')
    # Showing inside the loop closes the figure so plots do not pile up.
    plt.show()
# Pairwise scatter/density grid over a handful of mean features, coloured by
# diagnosis.
pair_features = [
    'radius_mean',
    'texture_mean',
    'area_mean',
    'perimeter_mean',
    'smoothness_mean',
]
sns.pairplot(df, hue='diagnosis', vars=pair_features)
<seaborn.axisgrid.PairGrid at 0x26c9c308880>
sns.set_style("darkgrid")

# Identify numerical columns
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns

# Plot distribution of each numerical feature, two histograms per row.
# The row count is ceil(n / 2): asking for n rows x 2 columns while drawing
# only n plots leaves the bottom half of the figure empty.
n_plot_cols = 2
n_plot_rows = (len(numerical_columns) + n_plot_cols - 1) // n_plot_cols
plt.figure(figsize=(14, n_plot_rows * 3))
for idx, feature in enumerate(numerical_columns, 1):
    plt.subplot(n_plot_rows, n_plot_cols, idx)
    sns.histplot(df[feature], kde=True)
    plt.title(f"{feature} | Skewness: {round(df[feature].skew(), 2)}")
# Lay out and render once, after every subplot has been drawn.
plt.tight_layout()
plt.show()
# Mean area against mean smoothness, coloured by diagnosis.
sns.scatterplot(data=df, x='area_mean', y='smoothness_mean', hue='diagnosis')
<Axes: xlabel='area_mean', ylabel='smoothness_mean'>
# Concavity standard error against mean radius, coloured by diagnosis.
sns.scatterplot(data=df, x='concavity_se', y='radius_mean', hue='diagnosis')
<Axes: xlabel='concavity_se', ylabel='radius_mean'>
# Compactness standard error against mean radius, coloured by diagnosis.
sns.scatterplot(data=df, x='compactness_se', y='radius_mean', hue='diagnosis')
<Axes: xlabel='compactness_se', ylabel='radius_mean'>
# Faceted scatter of mean area vs. mean smoothness per diagnosis; the
# regression line is switched off, so only the points are drawn.
sns.lmplot(
    data=df,
    x='area_mean',
    y='smoothness_mean',
    hue='diagnosis',
    fit_reg=False,
)
<seaborn.axisgrid.FacetGrid at 0x26ca85e11e0>
# Categorical plot of mean area grouped by mean texture, coloured by diagnosis.
# NOTE(review): texture_mean is continuous, so every distinct value becomes
# its own category on the x axis - confirm a categorical plot is intended.
sns.catplot(data=df, x='texture_mean', y='area_mean', hue='diagnosis')
<seaborn.axisgrid.FacetGrid at 0x26ca85e2560>
# Annotated correlation heatmap of the diagnosis label and the measurements.
# 'id' is a record identifier and 'Unnamed: 32' contains no values at all, so
# both are left out: the first is meaningless to correlate and the second
# only yields an all-NaN (blank) row and column.
plt.figure(figsize=(24,12))
corr_input = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
sns.heatmap(corr_input.corr(), annot=True,fmt='.2f', linewidths=2)
plt.title('Correlation Heatmap')
plt.show()
from sklearn.preprocessing import StandardScaler

# The thirty measurement columns to standardise (identifier, label and the
# empty trailing column are deliberately not listed).
numerical_cols = [
    'radius_mean', 'texture_mean', 'area_mean', 'perimeter_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
    'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]

# Standardise the measurement columns in place.
# NOTE(review): the scaler is fitted on every row, i.e. before any train/test
# split, so test-set statistics leak into the transformed features.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_cols])
df[numerical_cols] = scaled_values
df.head()
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | 1 | 1.097064 | -2.073335 | 1.269934 | 0.984375 | 1.568466 | 3.283515 | 2.652874 | 2.532475 | ... | -1.359293 | 2.303601 | 2.001237 | 1.307686 | 2.616665 | 2.109526 | 2.296076 | 2.750622 | 1.937015 | NaN |
| 1 | 842517 | 1 | 1.829821 | -0.353632 | 1.685955 | 1.908708 | -0.826962 | -0.487072 | -0.023846 | 0.548144 | ... | -0.369203 | 1.535126 | 1.890489 | -0.375612 | -0.430444 | -0.146749 | 1.087084 | -0.243890 | 0.281190 | NaN |
| 2 | 84300903 | 1 | 1.579888 | 0.456187 | 1.566503 | 1.558884 | 0.942210 | 1.052926 | 1.363478 | 2.037231 | ... | -0.023974 | 1.347475 | 1.456285 | 0.527407 | 1.082932 | 0.854974 | 1.955000 | 1.152255 | 0.201391 | NaN |
| 3 | 84348301 | 1 | -0.768909 | 0.253732 | -0.592687 | -0.764464 | 3.283553 | 3.402909 | 1.915897 | 1.451707 | ... | 0.133984 | -0.249939 | -0.550021 | 3.394275 | 3.893397 | 1.989588 | 2.175786 | 6.046041 | 4.935010 | NaN |
| 4 | 84358402 | 1 | 1.750297 | -1.151816 | 1.776573 | 1.826229 | 0.280372 | 0.539340 | 1.371011 | 1.428493 | ... | -1.466770 | 1.338539 | 1.220724 | 0.220556 | -0.313395 | 0.613179 | 0.729259 | -0.868353 | -0.397100 | NaN |
5 rows × 33 columns
# Separate the label from the feature columns.
unwantedcolumnlist = ["diagnosis", "Unnamed: 32", "id"]
y = df['diagnosis']
X = df.drop(columns=unwantedcolumnlist)

# Simple random sampling: 60 rows drawn with a fixed seed.
simple_random_sample = df.sample(n=60, random_state=42)
print("Simple Random Sampling:")
print(simple_random_sample.head())
Simple Random Sampling:
id diagnosis radius_mean texture_mean perimeter_mean area_mean \
204 87930 0 -0.470694 -0.160486 -0.448110 -0.491999
70 859575 1 1.366877 0.470149 1.302886 1.351264
131 8670 1 0.378508 0.044296 0.400820 0.267377
431 907915 0 -0.490575 -0.374576 -0.432457 -0.532101
540 921385 0 -0.734828 -1.128546 -0.713374 -0.716683
smoothness_mean compactness_mean concavity_mean concave points_mean \
204 0.234114 0.027651 -0.109847 -0.276232
70 -0.446227 -0.027309 0.241064 0.789060
131 0.913744 0.340350 0.725686 0.824140
431 0.643316 0.516599 -0.142993 -0.539846
540 0.247636 0.145150 -0.269044 -0.592724
... texture_worst perimeter_worst area_worst smoothness_worst \
204 ... -0.168905 -0.333935 -0.356299 0.448503
70 ... 0.147012 1.746605 1.732277 -0.572873
131 ... 0.052562 0.525386 0.484159 0.974533
431 ... -0.450625 -0.525756 -0.641257 0.553709
540 ... -0.976611 -0.848337 -0.743216 0.093432
compactness_worst concavity_worst concave points_worst symmetry_worst \
204 -0.104741 -0.024412 -0.199563 0.183204
70 -0.131459 -0.016736 0.978975 -0.565828
131 -0.094562 0.512911 0.560244 -0.103143
431 0.054930 -0.152986 -0.622863 -0.557739
540 -0.270137 -0.443716 -0.691687 -0.924975
fractal_dimension_worst Unnamed: 32
204 0.196958 NaN
70 -1.000578 NaN
131 -0.208132 NaN
431 0.534440 NaN
540 -0.144403 NaN
[5 rows x 33 columns]
strata = df['diagnosis'].unique()


def _draw_fifty(group):
    """Return 50 seeded random rows from one diagnosis group."""
    return group.sample(n=50, random_state=42)


# Stratified sampling: the same number of rows from each diagnosis class.
stratified_sample = df.groupby('diagnosis').apply(_draw_fifty)
print("Stratified Sampling:")
print(stratified_sample.head())
Stratified Sampling:
id diagnosis radius_mean texture_mean perimeter_mean \
diagnosis
0 395 903811 0 -0.019112 -0.490929 -0.091402
110 864033 0 -1.235545 -0.535144 -1.213835
481 91227 0 -0.064554 -0.011554 -0.133416
493 914101 0 -0.473535 -1.503204 -0.541199
136 868223 0 -0.686545 -0.609610 -0.710491
area_mean smoothness_mean compactness_mean concavity_mean \
diagnosis
0 395 -0.130229 -1.132262 -0.961427 -0.778274
110 -1.037213 0.522334 -0.384734 -0.570740
481 -0.147862 -1.170692 -0.968060 -0.738851
493 -0.505082 -1.611206 -1.211208 -1.024816
136 -0.657810 0.621966 -0.822323 -0.663898
concave points_mean ... texture_worst perimeter_worst \
diagnosis ...
0 395 -0.423257 ... -0.054915 -0.322915
110 -0.803203 ... -0.685120 -1.059816
481 -0.727884 ... 0.120957 -0.085224
493 -0.965447 ... -1.517252 -0.715492
136 -0.591176 ... -0.032117 -0.628517
area_worst smoothness_worst compactness_worst \
diagnosis
0 395 -0.344697 -1.129589 -0.834393
110 -0.902834 0.628230 -0.494694
481 -0.088042 -1.138356 -0.717343
493 -0.609263 -1.664826 -1.205453
136 -0.586937 -0.230954 -0.963529
concavity_worst concave points_worst symmetry_worst \
diagnosis
0 395 -0.899960 -0.540487 -0.611126
110 -0.682153 -0.932876 -0.594948
481 -0.503205 -0.504095 -0.881295
493 -1.225520 -1.336990 -1.004247
136 -0.804010 -0.684074 -1.923146
fractal_dimension_worst Unnamed: 32
diagnosis
0 395 -0.989495 NaN
110 0.040685 NaN
481 -0.438661 NaN
493 -0.757302 NaN
136 -0.582743 NaN
[5 rows x 33 columns]
# Systematic sampling: keep every 10th row, starting with the first.
SAMPLING_STEP = 10
systematic_sample = df.iloc[::SAMPLING_STEP]
print("Systematic Sampling:")
print(systematic_sample.head())
Systematic Sampling:
id diagnosis radius_mean texture_mean perimeter_mean area_mean \
0 842302 1 1.097064 -2.073335 1.269934 0.984375
10 845636 1 0.537556 0.919273 0.442011 0.406453
20 8510653 0 -0.297446 -0.833008 -0.261106 -0.383638
30 853401 1 1.278833 1.354435 1.352314 1.231812
40 855167 1 -0.195201 0.532980 -0.238451 -0.261342
smoothness_mean compactness_mean concavity_mean concave points_mean \
0 1.568466 3.283515 2.652874 2.532475
10 -1.017686 -0.713542 -0.700684 -0.404686
20 0.792763 0.429422 -0.541362 -0.459627
30 0.714481 1.598728 1.796625 1.946952
40 -1.048999 -0.834452 -0.724413 -0.737944
... texture_worst perimeter_worst area_worst smoothness_worst \
0 ... -1.359293 2.303601 2.001237 1.307686
10 ... 1.335771 0.492622 0.473611 -0.625477
20 ... -0.844707 -0.332744 -0.439624 -0.051226
30 ... 1.356941 1.585762 1.387726 0.733436
40 ... 0.744648 -0.141817 -0.162929 -1.006849
compactness_worst concavity_worst concave points_worst symmetry_worst \
0 2.616665 2.109526 2.296076 2.750622
10 -0.630828 -0.605872 -0.226210 0.076431
20 0.148443 -0.399099 -0.636110 0.458227
30 1.090566 1.636491 1.068812 0.878850
40 -0.317847 -0.305547 -0.051865 0.150849
fractal_dimension_worst Unnamed: 32
0 1.937015 NaN
10 0.031819 NaN
20 -0.117250 NaN
30 0.768849 NaN
40 -0.691912 NaN
[5 rows x 33 columns]
# Assign every row to one of two pseudo-clusters. A seeded generator keeps
# the assignment reproducible between runs, like the random_state=42 used
# for every other sampling step.
# NOTE: 'cluster' is random noise stored on df purely for this sampling
# demonstration; it is not a measurement and must not be used as a feature.
rng = np.random.default_rng(42)
df['cluster'] = rng.choice([0, 1], size=len(df))

# Draw 20% of the rows from each pseudo-cluster.
cluster_sample = df.groupby('cluster').apply(lambda x: x.sample(frac=0.2, random_state=42))
print("Cluster Sampling:")
print(cluster_sample.head())
Cluster Sampling:
id diagnosis radius_mean texture_mean perimeter_mean \
cluster
0 78 8610862 1 1.719055 1.089149 2.130809
254 886226 1 1.511725 0.009390 1.422337
404 904969 0 -0.507616 -1.009865 -0.563442
282 89122 1 1.497524 -0.258223 1.451171
523 917896 0 -0.118517 -0.141869 -0.133416
area_mean smoothness_mean compactness_mean concavity_mean \
cluster
0 78 1.678336 2.294354 4.568425 3.598263
254 1.462184 0.508101 0.274020 0.616458
404 -0.528403 -0.678938 -1.111144 -0.850089
282 1.393926 0.522334 0.755387 0.926565
523 -0.238589 0.199243 0.050392 -0.438788
concave points_mean ... perimeter_worst area_worst \
cluster ...
0 78 2.875535 ... 1.877663 1.305104
254 0.954141 ... 1.663205 1.918616
404 -0.732011 ... -0.689578 -0.610845
282 1.179323 ... 1.344497 1.313894
523 -0.286033 ... -0.233259 -0.314109
smoothness_worst compactness_worst concavity_worst \
cluster
0 78 1.382207 2.303684 2.379147
254 0.759738 0.393357 0.765260
404 -1.208494 -1.188468 -1.069745
282 0.851793 0.767407 0.764781
523 0.444119 0.014854 -0.377510
concave points_worst symmetry_worst fractal_dimension_worst \
cluster
0 78 2.073768 4.107940 0.869706
254 1.298734 0.773694 0.307790
404 -1.015252 -0.975127 -1.341385
282 1.683967 1.115046 -0.336696
523 0.210032 -0.083729 0.352676
Unnamed: 32 cluster
cluster
0 78 NaN 0
254 NaN 0
404 NaN 0
282 NaN 0
523 NaN 0
[5 rows x 34 columns]
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Keep summary statistics and pairwise correlations for later inspection.
summary_stats = df.describe()
correlation_matrix = df.corr()

# Class balance of the encoded label.
sns.countplot(data=df, x='diagnosis')
plt.title('Distribution of Diagnosis (0: B, 1: M)')
plt.show()

# Remove every column that contains at least one missing value.
df = df.dropna(axis="columns")
print("DataFrame after dropping columns with missing data:")
print(df)
DataFrame after dropping columns with missing data:
id diagnosis radius_mean texture_mean perimeter_mean \
0 842302 1 1.097064 -2.073335 1.269934
1 842517 1 1.829821 -0.353632 1.685955
2 84300903 1 1.579888 0.456187 1.566503
3 84348301 1 -0.768909 0.253732 -0.592687
4 84358402 1 1.750297 -1.151816 1.776573
.. ... ... ... ... ...
564 926424 1 2.110995 0.721473 2.060786
565 926682 1 1.704854 2.085134 1.615931
566 926954 1 0.702284 2.045574 0.672676
567 927241 1 1.838341 2.336457 1.982524
568 92751 0 -1.808401 1.221792 -1.814389
area_mean smoothness_mean compactness_mean concavity_mean \
0 0.984375 1.568466 3.283515 2.652874
1 1.908708 -0.826962 -0.487072 -0.023846
2 1.558884 0.942210 1.052926 1.363478
3 -0.764464 3.283553 3.402909 1.915897
4 1.826229 0.280372 0.539340 1.371011
.. ... ... ... ...
564 2.343856 1.041842 0.219060 1.947285
565 1.723842 0.102458 -0.017833 0.693043
566 0.577953 -0.840484 -0.038680 0.046588
567 1.735218 1.525767 3.272144 3.296944
568 -1.347789 -3.112085 -1.150752 -1.114873
concave points_mean ... texture_worst perimeter_worst area_worst \
0 2.532475 ... -1.359293 2.303601 2.001237
1 0.548144 ... -0.369203 1.535126 1.890489
2 2.037231 ... -0.023974 1.347475 1.456285
3 1.451707 ... 0.133984 -0.249939 -0.550021
4 1.428493 ... -1.466770 1.338539 1.220724
.. ... ... ... ... ...
564 2.320965 ... 0.117700 1.752563 2.015301
565 1.263669 ... 2.047399 1.421940 1.494959
566 0.105777 ... 1.374854 0.579001 0.427906
567 2.658866 ... 2.237926 2.303601 1.653171
568 -1.261820 ... 0.764190 -1.432735 -1.075813
smoothness_worst compactness_worst concavity_worst \
0 1.307686 2.616665 2.109526
1 -0.375612 -0.430444 -0.146749
2 0.527407 1.082932 0.854974
3 3.394275 3.893397 1.989588
4 0.220556 -0.313395 0.613179
.. ... ... ...
564 0.378365 -0.273318 0.664512
565 -0.691230 -0.394820 0.236573
566 -0.809587 0.350735 0.326767
567 1.430427 3.904848 3.197605
568 -1.859019 -1.207552 -1.305831
concave points_worst symmetry_worst fractal_dimension_worst cluster
0 2.296076 2.750622 1.937015 0
1 1.087084 -0.243890 0.281190 1
2 1.955000 1.152255 0.201391 1
3 2.175786 6.046041 4.935010 1
4 0.729259 -0.868353 -0.397100 1
.. ... ... ... ...
564 1.629151 -1.360158 -0.709091 1
565 0.733827 -0.531855 -0.973978 0
566 0.414069 -1.104549 -0.318409 0
567 2.289985 1.919083 2.219635 0
568 -1.745063 -0.048138 -0.751207 0
[569 rows x 33 columns]
from sklearn.model_selection import train_test_split

# Build the feature matrix: drop the label and the record identifier.
unwantedcolumnlist=["diagnosis","id"]
x = df.drop(unwantedcolumnlist,axis=1)
# The cluster-sampling step stores a randomly generated 'cluster' column on
# df. It is pure noise, so it is removed here rather than being fed to the
# model; errors="ignore" keeps this working when the column is absent.
x = x.drop(columns=["cluster"], errors="ignore")
y = df['diagnosis']

# Hold out 20% of the rows for evaluation, then fit the baseline classifier.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
model = LogisticRegression(random_state=42)
model.fit(x_train, y_train)
LogisticRegression(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(random_state=42)
# Predict on the held-out rows and report the fraction classified correctly.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.9736842105263158
# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:', '\n', cm)
Confusion Matrix: [[70 1] [ 2 41]]
# Render the confusion matrix as an annotated heatmap.
sns.heatmap(data=cm, annot=True)
<Axes: >
# Per-class precision, recall and F1 for the baseline model.
report = classification_report(y_test, y_pred)
print('Classification Report:', '\n', report)
Classification Report:
precision recall f1-score support
0 0.97 0.99 0.98 71
1 0.98 0.95 0.96 43
accuracy 0.97 114
macro avg 0.97 0.97 0.97 114
weighted avg 0.97 0.97 0.97 114
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Split FIRST, then rebalance only the training portion. Resampling before
# the split changes the class mix of the test set (and, with over-sampling,
# puts copies of training rows into it), which makes the reported scores
# unreliable. The split arguments match the baseline model's, so both
# models are evaluated on the same held-out rows.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Previously the over-sampled result was computed and then immediately
# overwritten by the under-sampled one, so only under-sampling ever took
# effect. The over-sampler is kept as a ready alternative: use
# ros.fit_resample(x_train, y_train) below to over-sample instead.
ros = RandomOverSampler(random_state=42)
rus = RandomUnderSampler(random_state=42)
x_res, y_res = rus.fit_resample(x_train, y_train)

# Fit on the balanced training data; x_test / y_test stay untouched.
model = LogisticRegression(random_state=42)
model.fit(x_res, y_res)
LogisticRegression(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(random_state=42)
# Score the model trained on resampled data against its held-out rows.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')
Accuracy: 0.9764705882352941
# Confusion matrix for the resampled model (rows: true, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion Matrix:', '\n', cm)
Confusion Matrix: [[44 2] [ 0 39]]
# Per-class precision, recall and F1 for the resampled model.
report = classification_report(y_test, y_pred)
print('Classification Report:', '\n', report)
Classification Report:
precision recall f1-score support
0 1.00 0.96 0.98 46
1 0.95 1.00 0.97 39
accuracy 0.98 85
macro avg 0.98 0.98 0.98 85
weighted avg 0.98 0.98 0.98 85
The classification report shows that the model performs very well on the held-out test set: precision, recall, and F1-score are all at least 0.95 for both classes, and the overall accuracy is 0.98.